# Import necessary Python modules.
from pycaret.anomaly import *
import pandas as pd
import missingno as miss
from plotnine import *
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Suppress warning output so the notebook cells stay readable.
warnings.filterwarnings("ignore")
# Read in all subaward data from the USAspending CSV export.
data_sub_awards_all = \
(pd
.read_csv("../data/All_Subawards_2022-04-12_H19M40S42878092/All_Contracts_Subawards_2022-04-12_H19M40S39_1.csv")
)
# Check size of all subaward data (rows, columns); output shown below.
data_sub_awards_all.shape
(162207, 107)
# Inspect the dtype of every feature to spot columns needing conversion.
data_sub_awards_all.dtypes
prime_award_unique_key object prime_award_piid object prime_award_parent_piid object prime_award_amount float64 prime_award_disaster_emergency_fund_codes object prime_award_outlayed_amount_funded_by_COVID-19_supplementals float64 prime_award_obligated_amount_funded_by_COVID-19_supplementals float64 prime_award_base_action_date object prime_award_base_action_date_fiscal_year int64 prime_award_latest_action_date object prime_award_latest_action_date_fiscal_year int64 prime_award_period_of_performance_start_date object prime_award_period_of_performance_current_end_date object prime_award_period_of_performance_potential_end_date object prime_award_awarding_agency_code int64 prime_award_awarding_agency_name object prime_award_awarding_sub_agency_code object prime_award_awarding_sub_agency_name object prime_award_awarding_office_code object prime_award_awarding_office_name object prime_award_funding_agency_code int64 prime_award_funding_agency_name object prime_award_funding_sub_agency_code object prime_award_funding_sub_agency_name object prime_award_funding_office_code object prime_award_funding_office_name object prime_award_treasury_accounts_funding_this_award object prime_award_federal_accounts_funding_this_award object prime_award_object_classes_funding_this_award object prime_award_program_activities_funding_this_award object prime_awardee_uei object prime_awardee_duns float64 prime_awardee_name object prime_awardee_dba_name object prime_awardee_parent_uei object prime_awardee_parent_duns float64 prime_awardee_parent_name object prime_awardee_country_code object prime_awardee_country_name object prime_awardee_address_line_1 object prime_awardee_city_name object prime_awardee_county_name object prime_awardee_state_code object prime_awardee_state_name object prime_awardee_zip_code float64 prime_awardee_congressional_district object prime_awardee_foreign_postal_code object prime_awardee_business_types object prime_award_primary_place_of_performance_city_name object 
prime_award_primary_place_of_performance_state_code object prime_award_primary_place_of_performance_state_name object prime_award_primary_place_of_performance_address_zip_code object prime_award_primary_place_of_performance_congressional_district float64 prime_award_primary_place_of_performance_country_code object prime_award_primary_place_of_performance_country_name object prime_award_description object prime_award_project_title object prime_award_naics_code int64 prime_award_naics_description object prime_award_national_interest_action_code object prime_award_national_interest_action object subaward_type object subaward_fsrs_report_id object subaward_fsrs_report_year int64 subaward_fsrs_report_month int64 subaward_number object subaward_amount float64 subaward_action_date object subaward_action_date_fiscal_year int64 subawardee_uei object subawardee_duns float64 subawardee_name object subawardee_dba_name object subawardee_parent_uei object subawardee_parent_duns float64 subawardee_parent_name object subawardee_country_code object subawardee_country_name object subawardee_address_line_1 object subawardee_city_name object subawardee_state_code object subawardee_state_name object subawardee_zip_code object subawardee_congressional_district object subawardee_foreign_postal_code object subawardee_business_types object subaward_primary_place_of_performance_address_line_1 float64 subaward_primary_place_of_performance_city_name object subaward_primary_place_of_performance_state_code object subaward_primary_place_of_performance_state_name object subaward_primary_place_of_performance_address_zip_code object subaward_primary_place_of_performance_congressional_district object subaward_primary_place_of_performance_country_code object subaward_primary_place_of_performance_country_name object subaward_description object subawardee_highly_compensated_officer_1_name object subawardee_highly_compensated_officer_1_amount float64 subawardee_highly_compensated_officer_2_name object 
subawardee_highly_compensated_officer_2_amount float64 subawardee_highly_compensated_officer_3_name object subawardee_highly_compensated_officer_3_amount float64 subawardee_highly_compensated_officer_4_name object subawardee_highly_compensated_officer_4_amount float64 subawardee_highly_compensated_officer_5_name object subawardee_highly_compensated_officer_5_amount float64 usaspending_permalink object subaward_fsrs_report_last_modified_date object dtype: object
# Keep only the features needed for the downstream anomaly analysis.
cols_of_interest = [
    "prime_award_description",
    "prime_award_awarding_sub_agency_name",
    "prime_award_awarding_office_code",
    "prime_award_awarding_office_name",
    "prime_award_funding_sub_agency_name",
    "prime_award_funding_office_name",
    "prime_awardee_parent_name",
    "subaward_fsrs_report_year",
    "subaward_fsrs_report_month",
    "subaward_amount",
    "subaward_action_date",
    "subaward_action_date_fiscal_year",
    "subawardee_name",
    "subawardee_duns",
    "subawardee_business_types",
    "subaward_description",
]
data_sub_awards_all = data_sub_awards_all[cols_of_interest]
# Visualize missingness across the retained columns.
miss.matrix(data_sub_awards_all)
<AxesSubplot:>
# Tabulate missing-value counts per column, most-missing first.
na_counts = data_sub_awards_all.isna().sum()                    # Per-column count of NA values.
na_counts = na_counts.reset_index(name = "count")               # Turn the Series into a two-column frame.
na_counts = na_counts.rename(columns = {"index": "variable"})   # Give the columns meaningful names.
na_counts.sort_values(by = ["count"], ascending = False)        # Display sorted by missing count, descending.
| variable | count | |
|---|---|---|
| 14 | subawardee_business_types | 9020 |
| 6 | prime_awardee_parent_name | 5875 |
| 5 | prime_award_funding_office_name | 525 |
| 13 | subawardee_duns | 491 |
| 15 | subaward_description | 31 |
| 12 | subawardee_name | 18 |
| 0 | prime_award_description | 0 |
| 1 | prime_award_awarding_sub_agency_name | 0 |
| 2 | prime_award_awarding_office_code | 0 |
| 3 | prime_award_awarding_office_name | 0 |
| 4 | prime_award_funding_sub_agency_name | 0 |
| 7 | subaward_fsrs_report_year | 0 |
| 8 | subaward_fsrs_report_month | 0 |
| 9 | subaward_amount | 0 |
| 10 | subaward_action_date | 0 |
| 11 | subaward_action_date_fiscal_year | 0 |
# Quantify how many observations would be lost by dropping rows with any NA.
# Row count before dropping missing values.
num_before_drop = len(data_sub_awards_all)
# Row count after dropping every row containing at least one NA.
num_after_drop = len(data_sub_awards_all.dropna())
# Number of rows that contain at least one missing value.
num_missing_obs = num_before_drop - num_after_drop
# Report the share of observations affected (printed output shown below).
print(f"Percent of Observations with NA Values: {round(100 * num_missing_obs / num_before_drop, 3)}%")
Percent of Observations with NA Values: 9.397%
# Drop all rows with missing values (~9.4% of observations, per the check above).
data_sub_awards_all = data_sub_awards_all.dropna()
# Check the dimensionality of the resulting dataframe after dropping missing values.
data_sub_awards_all.shape
(146964, 16)
# Compute, per subawardee DUNS, the mean subaward amount and the number of
# subawards, using named aggregation to get flat column names directly.
analysis = (
    data_sub_awards_all
    .groupby("subawardee_duns")["subaward_amount"]
    .agg(subaward_amount_mean = "mean", subaward_amount_count = "count")
    .reset_index()
)
# Attach the per-DUNS aggregates to every subaward row.
data_sub_awards_all = data_sub_awards_all.merge(analysis, how = "left", on = "subawardee_duns")
# Only collect hypersonic subawards.
# Build a case-insensitive substring mask over the prime award description.
hyper_mask = (data_sub_awards_all["prime_award_description"]
              .str.lower()
              .str.contains("hypersonic", regex = False))
data_sub_awards_hyper = data_sub_awards_all[hyper_mask]
# Check the dimensions of the dataframe.
data_sub_awards_hyper.shape
(103, 18)
# Drop the raw *prime_award_description* text from both dataframes.
data_sub_awards_all = data_sub_awards_all.drop("prime_award_description", axis = 1)
data_sub_awards_hyper = data_sub_awards_hyper.drop("prime_award_description", axis = 1)
# Keep untouched copies of the dataframes for later use.
data_sub_awards_all_copy = data_sub_awards_all.copy()
data_sub_awards_hyper_copy = data_sub_awards_hyper.copy()
# Convert the non-amount integer/object features to pandas categoricals so
# they are treated as discrete levels downstream.
cols_to_convert = [
    col
    for col in data_sub_awards_hyper.select_dtypes(include = ["int", "object"]).columns
    if "amount" not in col
]
# *subawardee_duns* is stored as a float but is really an identifier, so
# convert it to a category as well.
cols_to_convert.append("subawardee_duns")
for column in cols_to_convert:
    data_sub_awards_hyper[column] = data_sub_awards_hyper[column].astype("category")
# Break *subaward_amount* into categories.
# Identify the maximum positive *subaward_amount* in the data.
max_value = data_sub_awards_hyper.loc[data_sub_awards_hyper["subaward_amount"] > 0, "subaward_amount"].max()
# Split the [0, max_value] range into 5 equal-width bins (6 edges).
bins = np.linspace(0, max_value, 6).round(0)
# Create array of lower bounds for bins (extra leading 0 pads the alignment
# so that rows 1-5 of the display below are the real bins).
lower_bins = np.insert(bins, 0, 0)
# Create array of upper bounds for bins; the top bin is open-ended.
# NOTE: np.Inf was removed in NumPy 2.0 -- use the lowercase np.inf alias.
upper_bins = np.insert(bins, 5, np.inf)
# Create dataframe to show the bounds for the bins (rows 1-5 only).
pd.DataFrame({"Lower Bin Bound" : lower_bins,
              "Upper Bin Bound" : upper_bins
              }).loc[1:5, :].reset_index(drop = True)
| Lower Bin Bound | Upper Bin Bound | |
|---|---|---|
| 0 | 0.0 | 102983.0 |
| 1 | 102983.0 | 205967.0 |
| 2 | 205967.0 | 308950.0 |
| 3 | 308950.0 | 411933.0 |
| 4 | 411933.0 | inf |
# Unpack the five upper bin edges (bins[0] is the zero lower edge).
bin_0, bin_1, bin_2, bin_3, bin_4 = bins[1:6]
# Map *subaward_amount* onto ordinal codes 0-4 (anything above bin_4 -> 5).
# np.digitize with right=True assigns code i where edge[i-1] < amount <= edge[i],
# which reproduces the chained "<= edge" comparisons exactly.
bin_edges = [bin_0, bin_1, bin_2, bin_3, bin_4]
data_sub_awards_hyper["subaward_amount_cat"] = \
    np.digitize(data_sub_awards_hyper["subaward_amount"], bin_edges, right = True)
# Drop the raw *subaward_amount* now that the ordinal code replaces it.
data_sub_awards_hyper = data_sub_awards_hyper.drop(columns = ["subaward_amount"])
# Convert the ordinal code to a pandas categorical.
data_sub_awards_hyper["subaward_amount_cat"] = data_sub_awards_hyper["subaward_amount_cat"].astype("category")
# Plot the ordinal variable to observe the distribution.
# Select the ordinal (*_cat) columns and melt to long form for plotting.
data_sub_awards_hyper_ord = data_sub_awards_hyper.loc[:, data_sub_awards_hyper.columns.str.endswith("_cat")].melt()
# Plot the ordinal variables with plotnine (one facet per variable).
(
ggplot(data_sub_awards_hyper_ord, aes(x = "value")) +
geom_histogram(bins = 25) +
facet_wrap("variable", scales = "free") +
theme(figure_size = (10, 3),
subplots_adjust = {"wspace" : 0.25}) +
labs(x = "Value", y = "Count", title = "Visualize Distribution of Ordinal Variable")
)
<ggplot: (8761583250011)>
# We define a function to one-hot encode a variable and return the altered dataframe.
def conv_cat_dummy(df: pd.DataFrame, var: str, baseline: object) -> pd.DataFrame:
    """
    Convert a variable into dummy (one-hot) columns, dropping the baseline level.
    Arguments
    -----
    df: Pandas DataFrame
        The DataFrame containing the variable to be converted.
    var: string
        The name of the variable to be converted to dummy columns.
    baseline: string
        The level of the variable to treat as the baseline; its dummy column
        is dropped to avoid perfect collinearity among the dummies.
    return
    -----
    A Pandas DataFrame with *var* replaced by its dummy columns.
    """
    # Get the dummy variables for the given variable.
    dummies = pd.get_dummies(df[var])
    # Normalize dummy column names: lowercase, spaces replaced by underscores.
    dummies.columns = [str(c).lower().replace(" ", "_") for c in dummies.columns]
    # Drop the baseline level's dummy column.
    dummies = dummies.drop([str(baseline).lower().replace(" ", "_")], axis = 1)
    # Concatenate the dummies in place of the original variable column.
    return pd.concat([df.drop([var], axis = 1), dummies], axis = 1)
# One-hot encode every remaining feature except the three amount-derived
# numeric/ordinal columns, dropping each feature's first level as baseline.
excluded = ["subaward_amount_cat", "subaward_amount_mean", "subaward_amount_count"]
cols_to_dummy = [c for c in data_sub_awards_hyper.columns if c not in excluded]
for column in cols_to_dummy:
    # Use the first unique level of the column as the baseline to drop.
    default_to_remove = data_sub_awards_hyper[column].unique()[0]
    # Replace the column with its dummy columns.
    data_sub_awards_hyper = conv_cat_dummy(data_sub_awards_hyper, column, default_to_remove)
# Scatter plot: mean subaward amount vs. subaward count per row.
plt.scatter(data_sub_awards_hyper["subaward_amount_mean"], data_sub_awards_hyper["subaward_amount_count"])
# Render the plot.
plt.show()
# Declare the K-Means clustering model with 5 clusters.
# Seed it with the same value used elsewhere in this analysis
# (TSNE random_state and the PyCaret session_id are both 999) so the
# cluster assignments are reproducible across runs.
model = KMeans(n_clusters = 5, random_state = 999)
# Fit model to data.
model.fit(data_sub_awards_hyper)
KMeans(n_clusters=5)
# Get K-Means cluster assignments for every row of the engineered data.
all_predictions = model.predict(data_sub_awards_hyper)
# Re-plot mean vs. count, coloring each point by its predicted cluster.
plt.scatter(data_sub_awards_hyper["subaward_amount_mean"],
data_sub_awards_hyper["subaward_amount_count"],
c = all_predictions,
cmap = "viridis"
)
# Render the plot.
plt.show()
# Define the t-SNE embedding model (dimensionality reduction to 2-D for
# visualization; not itself a clustering algorithm).
model = TSNE(learning_rate = 100, random_state = 999)
# Fit the model and project the data into two dimensions.
transformed = model.fit_transform(data_sub_awards_hyper)
# Extract the two embedding axes.
x_axis = transformed[:, 0]
y_axis = transformed[:, 1]
# Plot the embedded points, colored by the K-Means cluster assignments above.
plt.scatter(x_axis, y_axis, c = all_predictions, cmap = "viridis")
# Render the plot.
plt.show()
# Set up the PyCaret anomaly-detection experiment on the untouched copy of
# the hypersonic data; session_id seeds it for reproducibility.
anom = setup(data = data_sub_awards_hyper_copy, silent = True, session_id = 999)
| Description | Value | |
|---|---|---|
| 0 | session_id | 999 |
| 1 | Original Data | (103, 17) |
| 2 | Missing Values | False |
| 3 | Numeric Features | 4 |
| 4 | Categorical Features | 12 |
| 5 | Ordinal Features | False |
| 6 | High Cardinality Features | False |
| 7 | High Cardinality Method | None |
| 8 | Transformed Data | (103, 201) |
| 9 | CPU Jobs | -1 |
| 10 | Use GPU | False |
| 11 | Log Experiment | False |
| 12 | Experiment Name | anomaly-default-name |
| 13 | USI | 4b06 |
| 14 | Imputation Type | simple |
| 15 | Iterative Imputation Iteration | None |
| 16 | Numeric Imputer | mean |
| 17 | Iterative Imputation Numeric Model | None |
| 18 | Categorical Imputer | mode |
| 19 | Iterative Imputation Categorical Model | None |
| 20 | Unknown Categoricals Handling | least_frequent |
| 21 | Normalize | False |
| 22 | Normalize Method | None |
| 23 | Transformation | False |
| 24 | Transformation Method | None |
| 25 | PCA | False |
| 26 | PCA Method | None |
| 27 | PCA Components | None |
| 28 | Ignore Low Variance | False |
| 29 | Combine Rare Levels | False |
| 30 | Rare Level Threshold | None |
| 31 | Numeric Binning | False |
| 32 | Remove Outliers | False |
| 33 | Outliers Threshold | None |
| 34 | Remove Multicollinearity | False |
| 35 | Multicollinearity Threshold | None |
| 36 | Remove Perfect Collinearity | False |
| 37 | Clustering | False |
| 38 | Clustering Iteration | None |
| 39 | Polynomial Features | False |
| 40 | Polynomial Degree | None |
| 41 | Trignometry Features | False |
| 42 | Polynomial Threshold | None |
| 43 | Group Features | False |
| 44 | Feature Selection | False |
| 45 | Feature Selection Method | classic |
| 46 | Features Selection Threshold | None |
| 47 | Feature Interaction | False |
| 48 | Feature Ratio | False |
| 49 | Interaction Threshold | None |
# Train an Isolation Forest anomaly detector, flagging ~5% of rows as anomalous.
anom_model = create_model(model = "iforest", fraction = 0.05)
# Score the dataset with the trained model (adds an Anomaly label column,
# queried below).
results = assign_model(anom_model)
# Plot the model to help us visualize the high-dimensional data using t-SNE.
plot_model(anom_model, plot = "tsne")
# Get anomaly predictions for the data we used.
data_predictions = predict_model(anom_model, data = data_sub_awards_hyper_copy)
# List the distinct companies flagged as anomalous (Anomaly == 1).
pd.DataFrame({"Anomaly_Company" : data_predictions.query("Anomaly == 1").subawardee_name.unique()})
| Anomaly_Company | |
|---|---|
| 0 | CUBRC, INC. |
| 1 | INTERSTATE ELECTRONICS CORPORATION |
| 2 | HYSONIC TECHNOLOGIES, LLC |
| 3 | LOCKHEED MARTIN CORPORATION |
| 4 | SIRIUS FEDERAL, LLC |